Setup

library(tidyverse)
## Warning in system("timedatectl", intern = TRUE): running command 'timedatectl'
## had status 1
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.4     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.0.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(cowplot)
all_counts <- readRDS(file= "Data/pc_count_matrix.rds")
avg_counts <- readRDS(file="Data/avg_pc_count_matrix.rds")
diff_counts <- readRDS(file="Data/diff_pc_count_matrix.rds")
meta <- suppressMessages(readr::read_tsv("Data/complete_meta_data.tsv"))
meta <- meta %>% mutate(tissue_type=replace(tissue_type, tissue_type=="embryonic facial prominence", "facial prominence"))
meta <- meta %>% mutate(tissue_type=replace(tissue_type, tissue_type=="skeletal muscle tissue", "muscle tissue"))
meta <- meta %>% mutate(tissue_type=replace(tissue_type, tissue_type=="skeletal muscle tissue", "muscle tissue"))
tissue_types <- unique(meta %>% pull(tissue_type))
collapsed_dev_counts <- matrix(data=NA, nrow=nrow(all_counts),ncol=length(tissue_types))
walker <- 1
for (tissue in tissue_types){
  tissue_subset_ids <- meta %>% filter(tissue_type == tissue) %>% pull(id)
  tissue_subset_count <- all_counts[,tissue_subset_ids]
  tissue_avg <- rowMeans(tissue_subset_count)
  collapsed_dev_counts[,walker] <- tissue_avg
  walker <- walker + 1
}
colnames(collapsed_dev_counts) <- tissue_types
rownames(collapsed_dev_counts) <- rownames(all_counts)
TFs<- c("Ascl1", "Hes1", "Neurod1", "Mecp2", "Mef2c", "Runx1", "Tcf4", "Pax6")

PCA coloured by dev_stage

##Remove zero variance 
all_counts_drop_var <- (t(all_counts))[, which(apply(t(all_counts),2,var) !=0)]
all_counts.pca <- prcomp(all_counts_drop_var, center = TRUE,scale. = TRUE)
library(ggfortify)
library(ggrepel)
library(RColorBrewer)
autoplot(all_counts.pca, data=meta, colour="dev_stage", label.size = 3, size=5) + theme_cowplot(12) + scale_color_brewer(palette = "RdYlBu") 

PCA coloured by tissue

getPalette = colorRampPalette(brewer.pal(12, "Paired"))
autoplot(all_counts.pca, data=meta, colour="tissue_type", label.size = 3, size=5) + theme_cowplot(12) + scale_color_manual(values = getPalette(length(tissue_types))) 

Pearson correlation heatmap

library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
all_counts_pear <- all_counts
colnames(all_counts_pear) <- sapply(colnames(all_counts), function(x){temp_row <- meta %>% filter(id==x) 
                                                                             return(paste0(temp_row %>% pull(tissue_type), " ",temp_row %>% pull(dev_stage)))})
pear_rcorr_object <- rcorr(all_counts_pear, type = "pearson")
pear_matrix <- pear_rcorr_object$r
pear_matrix_pvalues <- pear_rcorr_object$P
library(corrplot)
## corrplot 0.90 loaded
corrplot(pear_matrix, method = 'shade', type = 'lower', diag = FALSE, col = rep(rev(brewer.pal(n=8, name="RdYlBu")),2), col.lim=c(0,1))

Pearson correlation heatmap (collapsed by samples)

library(corrplot)
library(Hmisc)
avg_counts_pear <- avg_counts
colnames(avg_counts_pear) <- sapply(colnames(avg_counts), function(x){temp_row <- meta %>% filter(id==x) 
                                                                             return(paste0(temp_row %>% pull(tissue_type), " ",temp_row %>% pull(dev_stage)))})
pear_rcorr_object <- rcorr(avg_counts_pear, type = "pearson")
pear_matrix <- pear_rcorr_object$r
pear_matrix_pvalues <- pear_rcorr_object$P



corrplot(pear_matrix, method="shade", type="lower", diag=FALSE, col = rep(rev(brewer.pal(n=8, name="RdYlBu")),2), col.lim=c(0,1))

Pearson correlation heatmap (collapsed by samples) 17 clusters (because there are 17 tissues)

library(corrplot)
library(Hmisc)
avg_counts_pear <- avg_counts
colnames(avg_counts_pear) <- sapply(colnames(avg_counts), function(x){temp_row <- meta %>% filter(id==x) 
                                                                             return(paste0(temp_row %>% pull(tissue_type), " ",temp_row %>% pull(dev_stage)))})
pear_rcorr_object <- rcorr(avg_counts_pear, type = "pearson")
pear_matrix <- pear_rcorr_object$r
pear_matrix_pvalues <- pear_rcorr_object$P
testRes = cor.mtest(avg_counts_pear, conf.level = 0.95)


corrplot(pear_matrix, order = 'hclust', addrect = 17, method="shade", rect.col = 'black', col = rep(rev(brewer.pal(n=8, name="RdYlBu")),2), col.lim=c(0,1))

Pearson correlation heatmap (collapsed by samples) 8 clusters (because 8 unique dev stages)

library(corrplot)
library(Hmisc)
avg_counts_pear <- avg_counts
colnames(avg_counts_pear) <- sapply(colnames(avg_counts), function(x){temp_row <- meta %>% filter(id==x) 
                                                                             return(paste0(temp_row %>% pull(tissue_type), " ",temp_row %>% pull(dev_stage)))})
pear_rcorr_object <- rcorr(avg_counts_pear, type = "pearson")
pear_matrix <- pear_rcorr_object$r
pear_matrix_pvalues <- pear_rcorr_object$P
testRes = cor.mtest(avg_counts_pear, conf.level = 0.95)


corrplot(pear_matrix, order = 'hclust', addrect = 8, method="shade", rect.col = 'black', col = rep(rev(brewer.pal(n=8, name="RdYlBu")),2), col.lim=c(0,1))

Prepare Limma (with individual comparison)

# Demonstration of using limma to find differentially expressed genes

library(limma)
library(tidyverse)
library(edgeR)
library(tximport)

# load data
meta <- read.delim("Data/complete_meta_data.tsv", stringsAsFactors = FALSE)
pc <- read.delim("Data/pc_sub.tsv", stringsAsFactors = FALSE)
files <- paste0("Data/Count_tables/", meta$id, ".tsv")
counts <- tximport(files, type = "rsem", txIn = FALSE)
## reading in files with read_tsv
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
## 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57
## 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84
## 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
## 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128
## 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148
## 149 150 151 152 153 154 155 156
count_mat <- counts$counts

# name matrix and only keep protein coding genes
rownames(count_mat) <- str_replace(rownames(count_mat), "\\.[:digit:]*$", "")
colnames(count_mat) <- meta$id
count_mat <- count_mat[rownames(count_mat) %in% pc$Gene_ID, ]
oldrownames <- data.frame(rownames(count_mat)) 
oldrownames[,"Gene_ID"] <- oldrownames[,1]
rownames(count_mat) <- oldrownames %>% left_join(pc,"Gene_ID") %>% pull(Symbol)

# order by tissue and dev stage
meta <- meta %>% group_by(tissue_type) %>% arrange(dev_stage, .by_group = TRUE)
count_mat <- count_mat[, meta$id]
stopifnot(identical(colnames(count_mat), meta$id))

# build design matrix
design <- model.matrix(~ 0 + meta$dev_stage + meta$tissue_type)
#design <- model.matrix(~ meta$tissue_type * meta$dev_stage)

# edgeR: remove lowly expressed, scale normalize, and the log2 CPM transform
dge <- DGEList(count_mat)
keep <- filterByExpr(dge, design)
dge <- dge[keep, ]

# gene counts of removed
removed <- rowSums(count_mat[!keep,])
summary(removed)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    1.00    5.00   24.36   22.41  851.45
count_mat[which.max(removed), ]
## ENCFF465YOS ENCFF516EUX ENCFF929KWG ENCFF672DDJ ENCFF772UWT ENCFF262TXH 
##         217         148          29          44          27          22 
## ENCFF146HIO ENCFF129XUH ENCFF132NQU ENCFF867TKM ENCFF924CMS ENCFF370UDF 
##          30          33          19          17          25          45 
## ENCFF369TLJ ENCFF536XKZ ENCFF302TQO ENCFF413BXV ENCFF465SNB ENCFF976OLT 
##          24          33          19          12          17          17 
## ENCFF393ZAF ENCFF488HRD ENCFF567AFL ENCFF227HKF ENCFF745ZJF ENCFF816CVP 
##          16          21          16          13          32          14 
## ENCFF763GXJ ENCFF340XFQ ENCFF590FAC ENCFF484AOO ENCFF918QNL ENCFF895JXR 
##          23          23          10          17          34          24 
## ENCFF127FPD ENCFF721XZC ENCFF226IWR ENCFF540EJL ENCFF125TAY ENCFF824DCQ 
##          79          58          81          72          44          38 
## ENCFF242GMD ENCFF976CYB ENCFF111IGW ENCFF540BJT ENCFF440PWB ENCFF219PVC 
##          81          55          60          63          85          48 
## ENCFF415JBI ENCFF871IGQ ENCFF817KPY ENCFF155GNG ENCFF070GIC ENCFF097IDM 
##          50          58          60          41          36          42 
## ENCFF750FTK ENCFF109HTF ENCFF184ZAV ENCFF684GSP ENCFF131WIM ENCFF604LWF 
##          74          53          39          32          26          31 
## ENCFF206KRT ENCFF741FZD ENCFF395LAH ENCFF338ZXD ENCFF211ELX ENCFF830YBR 
##          43          18          31          30          34          44 
## ENCFF798MSP ENCFF861GUP ENCFF959OHE ENCFF228SAS ENCFF052THP ENCFF114YCL 
##          32          46          16          28          17          35 
## ENCFF443JRH ENCFF278PAQ ENCFF485CJB ENCFF795XBQ ENCFF499WRT ENCFF413OJO 
##          23          36          21          21         120         128 
## ENCFF700YRC ENCFF347HOK ENCFF752QKG ENCFF143OJZ ENCFF905HUL ENCFF783LVC 
##         122         129         120         162         257         326 
## ENCFF399HJB ENCFF597VHC ENCFF195JHC ENCFF457ZGF ENCFF982ERQ ENCFF449QSP 
##          22          51          34          32          20          27 
## ENCFF358WYS ENCFF634AUL ENCFF677BPV ENCFF794QMH ENCFF532ZDE ENCFF003DBZ 
##          30          22          28          15          29          15 
## ENCFF954EHG ENCFF523MEO ENCFF669WDQ ENCFF997HBI ENCFF615ZTQ ENCFF336VTP 
##          38          28          26          20          20          22 
## ENCFF432ZGG ENCFF572OPZ ENCFF740SXP ENCFF504YJB ENCFF759PUL ENCFF512KYX 
##           5           8           8           7          11           4 
## ENCFF875HIA ENCFF143HKK ENCFF872ABP ENCFF226ILJ ENCFF718PJC ENCFF996EMC 
##          10          20          50          48          38          43 
## ENCFF538RNR ENCFF365SND ENCFF990GIB ENCFF658LQM ENCFF996YFF ENCFF421TRR 
##          54          40          70          63          23          37 
## ENCFF359ZOA ENCFF971KZC ENCFF168CVY ENCFF390KQM ENCFF422BJI ENCFF196WAD 
##          57          59          15          20          24          31 
## ENCFF453GZG ENCFF870YWY ENCFF706XGJ ENCFF835FSF ENCFF918YFP ENCFF052VJO 
##          16          19          24          33          42          23 
## ENCFF210MWH ENCFF793WMU ENCFF375JDR ENCFF298WHK ENCFF532DDR ENCFF627COG 
##          45          35           8          24          11          20 
## ENCFF502BTV ENCFF049EIV ENCFF513HAL ENCFF967SJG ENCFF037GWJ ENCFF365DLM 
##          20           9          10           8          25          29 
## ENCFF402XKL ENCFF918XYP ENCFF576YVC ENCFF480GNL ENCFF600ZQX ENCFF871WCS 
##          20          31          23          25          22          35 
## ENCFF050PAT ENCFF691EQW ENCFF972NMO ENCFF355MOU ENCFF288JNN ENCFF052DOQ 
##          26          22          35          40          59          23 
## ENCFF517XLT ENCFF434XNZ ENCFF356QFG ENCFF319WZT ENCFF385MJV ENCFF923YGS 
##          41          46          15           9          25          16
# scale normalization using TMM method (default)
dge <- calcNormFactors(dge)

# convert counts to log2CPM 
cpm_counts <- cpm(dge, log = TRUE, prior.count = 3)

# fit model with limma eBayes trend 
fit <- lmFit(cpm_counts, design)
fit <- eBayes(fit, trend=TRUE)

# pulling specific contrasts
test_age <- topTable(fit, number = Inf, coef = "meta$dev_stageP0")
test_tissue <- topTable(fit, number = Inf, coef = "meta$tissue_typethymus")
#ggplot(data=test_age, aes(x = -log2(adj.P.Val))) + geom_histogram(binwidth = 0.1) + geom_vline(xintercept = -log(0.05), linetype="dotted", colour="red")
ggplot(data=test_age, aes(x = adj.P.Val)) + geom_histogram(binwidth = 0.01)

for (i in 1:5){
  print(paste0("The ", i ," differentially expressed genes in dev stage 0:  ", arrange(test_age, desc(logFC))[i,1], "name is:  ", rownames(arrange(test_age, desc(logFC)))[i]))
}
## [1] "The 1 differentially expressed genes in dev stage 0:  13.4426598341768name is:  mt-Atp6"
## [1] "The 2 differentially expressed genes in dev stage 0:  13.1775034528567name is:  mt-Co1"
## [1] "The 3 differentially expressed genes in dev stage 0:  12.5060148981273name is:  Hsd3b1"
## [1] "The 4 differentially expressed genes in dev stage 0:  12.3340004747955name is:  mt-Co3"
## [1] "The 5 differentially expressed genes in dev stage 0:  12.2950949067205name is:  Cyp21a1"
volcanoplot(fit,coef=5)